library(modeltime)
library(dplyr)
library(EIAapi)
library(jsonlite)
library(gt)
library(plotly)
library(lubridate)
library(modeltime)
source("../pipeline/eia_data.R")
source("../pipeline/backtesting.R")Data Refrsh
Load libraries
API Settings:
meta_json <- read_json(path = "../settings/settings.json")
s <- meta_json$series
series <- lapply(1:length(s), function(i) {
return(data.frame(
parent_id = s[[i]]$parent_id,
parent_name = s[[i]]$parent_name,
subba_id = s[[i]]$subba_id,
subba_name = s[[i]]$subba_name
))
}) |>
bind_rows()
facets_template <- list(
parent = NULL,
subba = NULL
)
eia_api_key <- Sys.getenv("EIA_API_KEY")
api_path <- meta_json$api_path
meta_path <- meta_json$meta_path
data_path <- meta_json$data_path
forecast_path <- meta_json$forecast_path
forecast_log_path <- meta_json$forecast_log_path
calibrated_models_path <- meta_json$calibrated_models_path
h <- meta_json$backtesting$h
lags <- meta_json$backtesting$features$lags |> unlist()
train_length <- meta_json$train_length
offset <- meta_json$offset
tz <- meta_json$timezone
models_settings <- meta_json$backtesting$models
init <- FALSE
save <- TRUEmeta_obj <- get_metadata(api_key = eia_api_key, api_path = api_path, meta_path = meta_path, series = series)
gt(meta_obj$request_meta)| parent | subba | end_act | request_start | end | updates_available |
|---|---|---|---|---|---|
| CISO | PGAE | 2024-07-26 07:00:00 | 2024-07-26 08:00:00 | 2024-07-26 07:00:00 | FALSE |
| CISO | SCE | 2024-07-26 07:00:00 | 2024-07-26 08:00:00 | 2024-07-26 07:00:00 | FALSE |
| CISO | SDGE | 2024-07-26 07:00:00 | 2024-07-26 08:00:00 | 2024-07-26 07:00:00 | FALSE |
| CISO | VEA | 2024-07-26 07:00:00 | 2024-07-26 08:00:00 | 2024-07-26 07:00:00 | FALSE |
m <- meta_obj$request_meta
index <- meta_obj$last_index + 1
data <- NULL
meta_new <- NULL
for (i in 1:nrow(m)) {
facets <- facets_template
facets$parent <- m$parent[i]
facets$subba <- m$subba[i]
start <- m$request_start[i]
end <- m$end[i]
print(paste(facets$parent, facets$subba, sep = " - "))
if (m$updates_available[i]) {
temp <- eia_backfill(
start = start - lubridate::hours(24),
end = end + lubridate::hours(24),
offset = offset,
api_key = eia_api_key,
api_path = paste(api_path, "data", sep = ""),
facets = facets
) |> dplyr::filter(time >= start & time <= end)
index <- seq.POSIXt(from = start, to = end, by = "hour")
ts_obj <- data.frame(period = index) |>
left_join(temp, by = c("period" = "time"))
data_available <- TRUE
} else {
ts_obj <- NULL
print("No new data is available")
data_available <- FALSE
}
meta_temp <- create_metadata(data = ts_obj, start = start, end = end, type = "refresh")
if (is.null(ts_obj)) {
meta_temp$parent <- m$parent[i]
meta_temp$subba <- m$subba[i]
}
if (meta_temp$success) {
print("Append the new data")
d <- append_data(data_path = data_path, new_data = ts_obj, save = TRUE)
meta_temp$update <- TRUE
} else {
meta_temp$update <- FALSE
if (data_available && !meta_temp$success) {
meta_temp$comments <- paste(meta_temp$comments, "The data refresh failed, please check the log; ", sep = "")
}
}
meta_temp$index <- NA
meta_df <- as.data.frame(meta_temp)
if (!is.null(ts_obj)) {
data <- bind_rows(data, ts_obj)
}
meta_new <- bind_rows(meta_new, meta_df)
}[1] "CISO - PGAE"
[1] "No new data is available"
[1] "CISO - SCE"
[1] "No new data is available"
[1] "CISO - SDGE"
[1] "No new data is available"
[1] "CISO - VEA"
[1] "No new data is available"
gt(meta_new)| index | parent | subba | time | start | end | start_act | end_act | start_match | end_match | n_obs | na | type | update | success | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | CISO | PGAE | 2024-07-27 12:20:43.477134 | 2024-07-26 08:00:00 | 2024-07-26 07:00:00 | NA | NA | NA | NA | NA | NA | refresh | FALSE | FALSE | No new data is available; |
| NA | CISO | SCE | 2024-07-27 12:20:43.478746 | 2024-07-26 08:00:00 | 2024-07-26 07:00:00 | NA | NA | NA | NA | NA | NA | refresh | FALSE | FALSE | No new data is available; |
| NA | CISO | SDGE | 2024-07-27 12:20:43.480118 | 2024-07-26 08:00:00 | 2024-07-26 07:00:00 | NA | NA | NA | NA | NA | NA | refresh | FALSE | FALSE | No new data is available; |
| NA | CISO | VEA | 2024-07-27 12:20:43.481455 | 2024-07-26 08:00:00 | 2024-07-26 07:00:00 | NA | NA | NA | NA | NA | NA | refresh | FALSE | FALSE | No new data is available; |
meta_updated <- append_metadata(meta_path = meta_path, new_meta = meta_new, save = TRUE, init = FALSE)[1] "Saving the metadata file"
Plot the Series
We will use Plotly to visualize the series:
if (!is.null(data)) {
d <- data |> arrange(subba, period)
p <- plot_ly(d, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")
p
} else {
print("No new data is available")
}[1] "No new data is available"
data <- readr::read_csv(file = data_path, col_types = readr::cols(
period = readr::col_datetime(format = ""),
subba = readr::col_character(),
subba_name = readr::col_character(),
parent = readr::col_character(),
parent_name = readr::col_character(),
value = readr::col_double(),
value_units = readr::col_character()
))
head(data)# A tibble: 6 x 8
period subba subba_name parent parent_name value value_units
<dttm> <chr> <chr> <chr> <chr> <dbl> <chr>
1 2018-07-01 08:00:00 PGAE Pacific Gas an~ CISO California~ 12522 megawattho~
2 2018-07-01 09:00:00 PGAE Pacific Gas an~ CISO California~ 11745 megawattho~
3 2018-07-01 10:00:00 PGAE Pacific Gas an~ CISO California~ 11200 megawattho~
4 2018-07-01 11:00:00 PGAE Pacific Gas an~ CISO California~ 10822 megawattho~
5 2018-07-01 12:00:00 PGAE Pacific Gas an~ CISO California~ 10644 megawattho~
6 2018-07-01 13:00:00 PGAE Pacific Gas an~ CISO California~ 10559 megawattho~
# i 1 more variable: type <chr>
p <- plot_ly(data, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")
pRefresh the forecast
fc <- NULL
fc <- refresh_forecast(
input = data,
forecast_log_path = forecast_log_path,
forecast_path = forecast_path,
models_settings = models_settings,
h = h,
index = "period",
var = "value",
train_length = 24 * 31 * 25,
lags = lags,
init = init,
save = save,
seasonal = TRUE,
trend = TRUE
)Checking if new data points are available to refresh the forecast
There is no sufficent data to refresh the froecast
if (!is.null(fc)) {
head(fc)
plot_forecast(
input = data,
forecast = fc,
var = "value",
index = "period",
hours = 24 * 3
)
}Score the Forecast
fc_log <- load_forecast_log(forecast_log_path = forecast_log_path)
score_rows <- which(!fc_log$score)
if (length(score_rows) == 0) {
message("All models were scored ")
} else {
subba <- unique(fc_log$subba[score_rows])
fc <- load_forecast(forecast_path = forecast_path)
for (i in subba) {
print(i)
d <- data |> dplyr::filter(subba == i)
r <- which(fc_log$subba == i & !fc_log$score)
for (l in r) {
f <- fc |>
dplyr::filter(forecast_label == fc_log$forecast_label[l], subba == i) |>
dplyr::left_join(d |> dplyr::select(time = period, subba, value), by = c("time", "subba")) |>
dplyr::filter(!is.na(value))
fc_log$mape[l] <- mean(abs(f$value - f$yhat) / f$value)
fc_log$rmse[l] <- (mean((f$value - f$yhat)^2))^0.5
fc_log$coverage[l] <- length(which(f$value <= f$upper & f$value >= f$lower)) / nrow(f)
if (nrow(f) == fc_log$h[l]) {
fc_log$score[l] <- TRUE
}
write.csv(fc_log, forecast_log_path, row.names = FALSE)
}
}
gt::gt(fc_log[score_rows, ])
}[1] "PGAE"
[1] "SCE"
[1] "SDGE"
[1] "VEA"
| index | subba | model | method | time | forecast_label | start | end | h | n_obs | n_obs_flag | na_flag | success | score | mape | rmse | coverage |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 45 | PGAE | LM | model6 | 2024-07-26 15:12:32.064844 | 2024-07-26 | 2024-07-26 | 2024-07-26 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.01015704 | 200.71928 | 1.000 |
| 46 | SCE | GLMNET | model18 | 2024-07-26 15:12:32.067201 | 2024-07-26 | 2024-07-26 | 2024-07-26 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.04278161 | 977.02378 | 1.000 |
| 47 | SDGE | LM | model7 | 2024-07-26 15:12:32.069602 | 2024-07-26 | 2024-07-26 | 2024-07-26 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.10423522 | 369.37307 | 0.750 |
| 48 | VEA | LM | model5 | 2024-07-26 15:12:32.071982 | 2024-07-26 | 2024-07-26 | 2024-07-26 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.07354985 | 15.60291 | 0.875 |